library(devtools)
library(arrow)
library(foreign)
library(conText) #install_github("prodriguezsosa/conText")
library(dplyr)
library(ggplot2)
library(text2vec)
library(tidytext)
library(quanteda)
library(stringr)
library(forcats)
library(gridExtra)
library(xtable)
library(boot)
library(boot.pval)


# set path to working directory
if(Sys.info()['user']=='fionashenbayh'){path_to_data<-"/Users/fionashenbayh/Dropbox/Perceptions_of_courts_project/ICC/JOP submission/Replication Files/data/"}
if(Sys.info()['user']==''){path_to_data<-""} 
setwd(path_to_data)

# set path to save files
#savedfilepath <- '/Users/fionashenbayh/Dropbox/Perceptions_of_courts_project/ICC/JOP submission/Replication Files/figures/'
savedfilepath='/Users/fionashenbayh/Dropbox/Apps/Overleaf/ICC (Copy)/figures/figures_2023'


pre_trained <- readRDS(paste0(path_to_data, 'glove.rds')) # GloVe embeddings
transform_matrix <- readRDS(paste0(path_to_data, 'khodakA.rds')) # transformation matrix
ken_oped <- read.csv('ken_oped_data.csv') # Kenyan media corpus

# format date and month variables
# create month variable
create_year_month <- function(data){
  data <- data %>% 
    group_by(month = lubridate::floor_date(date, "month"))
  return(data)
}

ken_oped$date <- as.Date(ken_oped$date, '%Y-%m-%d')
ken_oped$year <- format(ken_oped$date, '%Y')
ken_oped <- create_year_month(ken_oped)

#### Section A: Text Analysis ####

# following generates figure for disaggregated keyword trends using same model, keyword pairs as shown in Figure 3 in main text

# custom stopword list
custom_stop_words = c('a', 'am', 'an', 'at','as','are', 'and', 'any', 'all',
                      'be', 'because', 'been', 'being', 'both', 'but', 'by',
                      'can', 'cannot', 'could', 'did', 'do', 'does', 'doing',
                      'few', 'for', 'from', 'had', 'has', 'have', 'how',
                      'he', 'she', 'her', 'herself', 'him', 'himself', 'his', 'they', 'them', 'ours', 'our', 'their', 'theirs', 'themselves', 'you', 'your', 'yours','yourself', 'yourselves', 'myself', 'my',
                      'is', 'it', 'itself',
                      'more', 'most', 'me', 'mine', 'must', 
                      'until', 'nonetheless', 'nothing', 'never', 'neither', 'nobody', 'nowhere',
                      'many', 'few', 'some', 'little', 'lot', 'general', 'specific', 'whom', 'been', 'being', 'having', 'does', 'did', 'doing',  
                      'about', 'against', 'between', 'into', 'through', 'during', 
                      'before', 'after', 'above', 'below', 'from', 'down', 
                      'off', 'over', 'under', 'again', 'further', 'once',
                      'not', 'nor', 'no', 'none', 'noone', 'neither',
                      'only', 'off', 'once', 'only', 'own', 'out', 'other',
                      'said', 'say', 'says', 'same', 'should', 'such',
                      'that', 'the', 'then', 'than', 'this', 'those', 'through', 'to', 'too','these',
                      'was', 'were', 'will', 'wont', 'when', 'where', 'who', 'what', 'while', 'whom', 'why', 'with', 'would')

# function to estimate diachronic cosine similarities for given pairing of anchor word and keyword list
get_cosines <- function(df, anchor, word_features, pre_trained, transform_matrix, window_size){
  
  set.seed(23188)
  
  the_corpus <- corpus(df, 
                       docid_field = 'url', text_field = 'body', 
                       meta = c('year', 'month', 'opinion', 'publisher')) #create corpus object from dataframe
  
  the_toks <- tokens(the_corpus, remove_punct = T, remove_numbers = T,remove_symbols = T,
                     remove_separators = T,include_docvars = T) #tokenize corpus
  
  the_toks <- tokens_tolower(the_toks) #pre-processing tokens
  
  the_toks <- tokens_select(the_toks, pattern = custom_stop_words, selection = "remove", min_nchar = 3,
                            padding = TRUE) #pre-processing tokens
  
  toks_cont <- tokens_context(the_toks, 
                              pattern = anchor, 
                              window = window_size, case_insensitive = T) #create tokens context using specified window size
  
  results <- get_cos_sim(toks_cont, 
                         groups = docvars(toks_cont, 'month'),
                         features=word_features,
                         pre_trained=pre_trained,
                         transform=T,
                         transform_matrix=transform_matrix,
                         bootstrap=T,
                         num_bootstraps=100,
                         as_list=FALSE) 
  
  results <- rename(results, date=target)
  results$date <- as.Date(results$date)
  
  return(results)
}

# model from main results Figure 3 #
word_features <- c('politicize', 'persecute', 'victimize', 'bias', 'discriminate', 'unfair')

results <- get_cosines(ken_oped, 
                       word_features = word_features,
                       pre_trained = pre_trained,
                       transform_matrix = transform_matrix,
                       anchor = c('icc'),
                       window_size = 24L)

#### Figure A1: Disaggregated Keyword Trends ####
disagg <- ggplot(results,
                 aes(x=date, y=value, group=feature, color=feature)) + 
  geom_point(size=0.5)+
  geom_smooth(span = .2, inherit.aes=T)+
  facet_wrap(~feature, nrow = 1)+
  geom_hline(yintercept=0, linetype="dotted", color = "black", size=0.5) + 
  labs(y='Cosine Similarity Score', 
       fill='Keyword', title='Cosine Similarity Score between "icc" and political keywords', x="Date of Publication (Monthly Estimates)")+
  scale_colour_viridis_d()+
  scale_x_date(date_breaks = "2 year", date_labels='%Y')+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, size=8), 
        axis.text.y = element_text(size=12), 
        axis.title = element_text(size=14), 
        strip.text = element_text(size=14), legend.position='none')

ggsave(plot = disagg,
       filename = 'figure_A1.jpg',
       path=savedfilepath, 
       device = 'jpg', 
       width = 14, height=8)


# rerun estimates using different context windows

results6 <- get_cosines(ken_oped, 
                        word_features = word_features, pre_trained, transform_matrix = transform_matrix,
                        anchor = c('icc'),
                        window_size = 6L)

results12 <- get_cosines(ken_oped, 
                         word_features = word_features,
                         anchor = c('icc'), pre_trained, transform_matrix = transform_matrix,
                         window_size = 12L)

results24 <- get_cosines(ken_oped, 
                         word_features = word_features, pre_trained, transform_matrix = transform_matrix,
                         anchor = c('icc'),
                         window_size = 24L)

results48 <- get_cosines(ken_oped, 
                         word_features = word_features, pre_trained, transform_matrix = transform_matrix,
                         anchor = c('icc'),
                         window_size = 48L)


plotter <- function(results, main_title){
  ggplot(results,
         aes(x=date, y=value, group=feature, color=feature)) + 
    geom_point(size=0.5)+
    geom_smooth(span = .2, inherit.aes=T)+
    facet_wrap(~feature, nrow = 1)+
    geom_hline(yintercept=0, linetype="dotted", color = "black", size=0.5) + 
    labs(y='', 
         fill='Keyword', title=main_title, x="")+
    scale_colour_viridis_d()+
    scale_x_date(date_breaks = "2 year", date_labels='%Y')+
    theme_minimal()+
    theme(axis.text.x = element_text(angle = 45, size=8), 
          axis.text.y = element_text(size=12), 
          axis.title = element_text(size=14), 
          strip.text = element_text(size=14), legend.position='none')
}

six <- plotter(results6, main_title='Context window: 6 words')
twelve <- plotter(results12, main_title='Context window: 12 words')
twentyfour <- plotter(results24, main_title='Context window: 24 words')
fortyeight <- plotter(results48, main_title='Context window: 48 words')


#### Figure A2: Varying window size ####
windows_plot <- grid.arrange(six, twelve, twentyfour, fortyeight, 
                             ncol=1, 
                             top='Cosine Similarity Score between "icc" and political keywords',
                             left='Cosine Similarity Score', bottom='Date of Publication (Monthly Estimates)')

ggsave(plot=windows_plot,
       filename = 'figure_A2.jpg',
       path=savedfilepath, 
       device = 'jpg',
       width = 10, height=12)


#### Table A1: First Change Point ####

cpd_check <- function(results){
  
  set.seed(20010)
  
  agg_results <- 
    results %>%
    group_by(date) %>% 
    summarize(value = mean(value))
  
  # calculate cumulative sums
  all_cum_sum <- data.frame(date = agg_results$date, cum_sums = cumsum(agg_results$value-mean(agg_results$value)))
  s_diff <- max(all_cum_sum$cum_sums)-min(all_cum_sum$cum_sums)
  cp1 <- all_cum_sum$date[which.max(abs(all_cum_sum$cum_sums))] #CUSUM ESTIMATOR
  
  s_diff_fun <- function(d, inds){
    reordered <- d[inds,]$value
    cum_sums = cumsum(reordered-mean(reordered))
    sim_s_diff <- max(cum_sums)-min(cum_sums)
    return(sim_s_diff)
  }
  
  set.seed(20010)
  bootdiff <- boot(data = agg_results, statistic = s_diff_fun, R = 10000)
  booted_t <- as.data.frame(bootdiff$t)   
  
  pval <- boot.pval(bootdiff, type='perc', theta_null = bootdiff$t0)
  
  return(tibble(cp1, pval))
}


r6 <- cpd_check(results6)
r12 <- cpd_check(results12)
r24 <- cpd_check(results24)
r48 <- cpd_check(results48)

r6 <- plyr::rename(r6, c(cp1 = 'change point', pval = 'p-val'))
r12 <- plyr::rename(r12, c(cp1 = 'change point', pval = 'p-val'))
r24 <- plyr::rename(r24, c(cp1 = 'change point', pval = 'p-val'))
r48 <- plyr::rename(r48, c(cp1 = 'change point', pval = 'p-val'))

windows <- rbind(r6, r12, r24, r48)
windows$window <- c(6, 12, 24, 48)
windows <- windows %>% relocate(window)

xtable <- function(x, ...) {
  for (i in which(sapply(x, function(y) !all(is.na(match(c("POSIXt","Date"),class(y))))))) x[[i]] <- as.character(x[[i]])
  xtable::xtable(x, ...)
}

xtable(windows, digits=4)


#### Table A2: Second Change Point ####

cpd2_check <- function(results){
  
  set.seed(23188)
  
  agg_results <- 
    results %>%
    group_by(date) %>% 
    summarize(value = mean(value))
  
  # calculate cumulative sums
  all_cum_sum <- data.frame(date = agg_results$date, cum_sums = cumsum(agg_results$value-mean(agg_results$value)))
  s_diff <- max(all_cum_sum$cum_sums)-min(all_cum_sum$cum_sums)
  cp1 <- all_cum_sum$date[which.max(abs(all_cum_sum$cum_sums))] #CUSUM ESTIMATOR
  
  s_diff_fun <- function(d, inds){
    reordered <- d[inds,]$value
    cum_sums = cumsum(reordered-mean(reordered))
    sim_s_diff <- max(cum_sums)-min(cum_sums)
    return(sim_s_diff)
  }
  
  pre <- agg_results[agg_results$date < cp1,]
  post <- agg_results[agg_results$date > cp1,]  
  
  bootdiff_pre <- boot(data = pre, statistic = s_diff_fun, R = 10000)  
  bootdiff_post <- boot(data = post, statistic = s_diff_fun, R = 10000)  
  
  pval1 <- boot.pval(bootdiff_pre, type='perc', theta_null = bootdiff_pre$t0)
  pval2 <- boot.pval(bootdiff_post, type='perc', theta_null = bootdiff_post$t0)
  
  change_point <- function(sample){
    x <- data.frame(date = sample$date, cum_sums = cumsum(sample$value-mean(sample$value)))
    s_diff <- max(x$cum_sums)-min(x$cum_sums)
    cp <- x$date[which.max(abs(x$cum_sums))] 
    return(cp)#CUMSUM ESTIMATOR
  }
  
  cp2 <- change_point(pre)
  cp2.2 <- change_point(post)
  
  output1 <- tibble(cp2, pval1)
  output1 <- plyr::rename(output1, c('pval1' = 'p-val', 'cp2' = 'change point'))
  output2 <- tibble(cp2.2, pval2)
  output2 <-plyr::rename(output2, c('pval2' = 'p-val', 'cp2.2' = 'change point'))
  
  return(dplyr::bind_rows(output1, output2))
  
}

r6.2 <- cpd2_check(results6)
r12.2 <- cpd2_check(results12)
r24.2 <- cpd2_check(results24)
r48.2 <- cpd2_check(results48)

windows2 <- rbind(r6.2, r12.2, r24.2, r48.2)
windows2$window <- c(6, 6, 12, 12, 24, 24, 48, 48)
windows2 <- windows2 %>% relocate(window)

xtable(windows2, digits=4)


#### Nearest Neighbors and Contexts Analyses ####

ken_corpus <- corpus(ken_oped, 
                     docid_field = 'url', text_field = 'body', 
                     meta = c('year', 'month', 'opinion', 'publisher'))

ken_toks <- tokens(ken_corpus, 
                   remove_punct = T, 
                   remove_numbers = T,
                   remove_symbols = T,
                   remove_separators = T,
                   include_docvars = T)

ken_toks <- tokens_tolower(ken_toks)

toks_nostop <- tokens_select(ken_toks, pattern = custom_stop_words, selection = "remove", min_nchar = 3)

icc_toks <- tokens_context(toks_nostop, 
                           pattern = c('icc'), 
                           window = 24L, 
                           case_insensitive = T, 
                           verbose = FALSE)

feats <- dfm(icc_toks)
icc_dem <- dem(x = feats,
               pre_trained = pre_trained,
               transform = T,
               transform_matrix = transform_matrix,
               verbose = T)

# to get group-specific embeddings, average within category
icc_wv_pub <- dem_group(icc_dem, groups = icc_dem@docvars$publisher)  #publisher
icc_wv_year <- dem_group(icc_dem, groups = icc_dem@docvars$year)      #year

# find nearest neighbors by category
# setting as_list = FALSE combines each group's results into a single tibble (useful for joint plotting)
icc_nns_pub <- nns(icc_wv_pub, pre_trained, N = 5, candidates = icc_wv_pub@features, as_list = F)
icc_nns_year <- nns(icc_wv_year, pre_trained, N = 5, candidates = icc_wv_year@features, as_list = F)


#### Table A3: Nearest Neighbors by Publisher ####

nns_results_pub <- icc_nns_pub %>% 
  select(c('target', 'feature', 'value'))%>%
  arrange(target, desc(value))

nns_results_pub <- dplyr::rename(nns_results_pub, c(Publisher = target, Neighbors = feature, Cosine_Similarity = value))

print(xtable(nns_results_pub, label = 'table:nns'), include.rownames=FALSE)


#### Table A4: Nearest Neighbors by Year ####
nns_results_year <- icc_nns_year %>%
  select(c('target', 'feature', 'value'))%>%
  arrange(target, desc(value))

nns_results_year <- dplyr::rename(nns_results_year, c(Year = target, Neighbors = feature, Cosine_Similarity = value))

print(xtable(nns_results_year, label = 'table:nns'), include.rownames=FALSE)


#### Table A5: Nearest Contexts ####

# find nearest contexts for target `icc' within each publisher
set.seed(20010)
icc_ncs_capital <- ncs(x = icc_wv_pub["Capital FM (Nairobi)",], 
                       contexts_dem = icc_dem[icc_dem@docvars$publisher == "Capital FM (Nairobi)",], 
                       contexts = icc_toks, N = 2, as_list = FALSE)
icc_ncs_standard <- ncs(x = icc_wv_pub["The East African Standard (Nairobi)",], 
                        contexts_dem = icc_dem[icc_dem@docvars$publisher == "The East African Standard (Nairobi)",], 
                        contexts = icc_toks, N = 2, as_list = FALSE)
icc_ncs_star <- ncs(x = icc_wv_pub["The Star (Nairobi)",], 
                    contexts_dem = icc_dem[icc_dem@docvars$publisher == "The Star (Nairobi)",], 
                    contexts = icc_toks, N = 2, as_list = FALSE)
icc_ncs_nation <- ncs(x = icc_wv_pub["The Nation (Nairobi)",], 
                      contexts_dem = icc_dem[icc_dem@docvars$publisher == "The Nation (Nairobi)",], 
                      contexts = icc_toks, N = 2, as_list = FALSE)

# add publisher names
icc_ncs_capital$target <- "Capital FM"
icc_ncs_nation$target <- "The Nation"
icc_ncs_standard$target <- "The East African Standard"
icc_ncs_star$target <- "The Star"

# join and print output
ncs_all <- rbind(icc_ncs_capital, icc_ncs_standard, icc_ncs_nation, icc_ncs_star)
ncs_all <- ncs_all %>%
  select(c('target', 'context', 'value'))%>%
  arrange(target, desc(value))
ncs_all <- dplyr::rename(ncs_all, c(Publisher = target, Context = context, Cosine_Similarity = value))
print(xtable(ncs_all, label='ncs_all'),include.rownames=FALSE)


#### Figure A3: Alternative Anchors ####

ken_oped_2 <- ken_oped

ken_oped_2$body <- str_replace_all(ken_oped_2$body, 'Hague|hague|ocampo|Ocampo|Bensouda|bensouda', 'icc')

set.seed(23188)
results_2 <- get_cosines(ken_oped_2, 
                         word_features = word_features,
                         pre_trained = pre_trained,
                         transform_matrix = transform_matrix,
                         anchor = c('icc'), window_size = 24L)

agg_results_2 <- 
  results_2 %>%
  group_by(date) %>% 
  summarize(value = mean(value))

ggplot(agg_results_2,
       aes(x=date, y=value)) + 
  geom_point(size=0.5)+
  geom_smooth(
    span = .15, 
    inherit.aes=T)+
  geom_hline(yintercept=0, linetype="dotted", color = "black", size=0.5) + 
  labs(y='Cosine Similarity Score', 
       fill='Keyword', title='Cosine similarity score between alternative signifiers for the ICC and political keywords', x="Date of Publication (Monthly Estimates)")+
  scale_colour_viridis_d()+
  scale_x_date(date_breaks = "1 year", date_labels='%Y')+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, size=16), 
        axis.text.y = element_text(size=16), 
        axis.title = element_text(size=16), 
        title = element_text(size=16),
        strip.text = element_text(size=16), legend.position='none')

ggsave(
  filename = 'figure_A3.jpg',
  path=savedfilepath, 
  device = 'jpg', 
  width = 14, height=8)

cpd_check(results_2)
cpd2_check(results_2)

#### Figure A4: Locally Trained Embeddings #### 

# construct feature co-occurrence matrix
toks_fcm <- fcm(toks_nostop, context = "window", window = 6, count = "frequency", tri = FALSE)

local_glove <- readRDS(file = paste0(path_to_data, 'local_glove.rds'))
local_transform <- readRDS(file = paste0(path_to_data, 'local_transform.rds'))

# create document-embedding matrix using our locally trained GloVe embeddings and transformation matrix
icc_dem_local <- dem(x = feats,
                     pre_trained = local_glove, transform = TRUE, 
                     transform_matrix = local_transform, verbose = TRUE)


# take column average to get single 'corpus-wide' embedding
icc_wv_local <- colMeans(icc_dem_local)

# Rerunning main results
set.seed(23188)
results_local <- get_cosines(ken_oped, 
                             word_features = word_features,
                             pre_trained = local_glove,
                             transform_matrix = local_transform,
                             anchor = c('icc'), window_size = 6L)

res_loc <- ggplot(results_local,
                  aes(x=date, y=value, group=feature, color=feature)) + 
  geom_point(size=0.5)+
  geom_smooth(span = .2, inherit.aes=T)+
  facet_wrap(~feature, nrow = 1)+
  geom_hline(yintercept=0, linetype="dotted", color = "black", size=0.5) + 
  labs(y='Cosine Similarity Score', 
       fill='Keyword', title='Cosine Similarity Score between "icc" and political keywords', x="Date of Publication (Monthly Estimates)")+
  scale_colour_viridis_d()+
  scale_x_date(date_breaks = "2 year", date_labels='%Y')+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 45, size=8), 
        axis.text.y = element_text(size=12), 
        axis.title = element_text(size=14), 
        strip.text = element_text(size=14), legend.position='none')


ggsave(plot=res_loc,
       filename = 'figure_A4.jpg',
       path=savedfilepath, 
       device = 'jpg', 
       width = 14, height=8)

cpd_check(results_local)

#### Section B: CPD Algorithm -- Estimating additional change points ####

# When a significant change point has been estimated, split sample at change point
# Repeat CUSUM estimation procedure for pre- and post- sample
# Algorithm terminates if estimated change points are statistically insignificant

# main analysis repeated here to generate estimates used for CPD algorithm

word_features <- c('politicize', 'persecute', 'victimize', 'bias', 'discriminate', 'unfair')

results <- get_cosines(ken_oped, word_features = word_features, pre_trained = pre_trained, transform_matrix = transform_matrix, anchor = c('icc'), window_size = 24L)

# group by date, calculate mean cosine similarity within group
agg_results <- 
  results %>%
  group_by(date) %>% 
  summarize(value = mean(value))

# First change point (repeated from main analysis)
all_cum_sum <- data.frame(date = agg_results$date, cum_sums = cumsum(agg_results$value-mean(agg_results$value)))
s_diff <- max(all_cum_sum$cum_sums)-min(all_cum_sum$cum_sums)
cp1 <- all_cum_sum$date[which.max(abs(all_cum_sum$cum_sums))] #CUSUM ESTIMATOR

# because cp1 is statistically significant (see main analysis), repeat CUSUM estimation procedure to find second change point
pre <- agg_results[agg_results$date < cp1,]
post <- agg_results[agg_results$date > cp1,]

change_point <- function(sample){
  x <- data.frame(date = sample$date, cum_sums = cumsum(sample$value-mean(sample$value)))
  s_diff <- max(x$cum_sums)-min(x$cum_sums)
  cp <- x$date[which.max(abs(x$cum_sums))] 
  return(cp) #CUMSUM ESTIMATOR
}

change_point(pre)
change_point(post)

# cusum estimator function
s_diff_fun <- function(d, inds){
  reordered <- d[inds,]$value
  cum_sums = cumsum(reordered-mean(reordered))
  sim_s_diff <- max(cum_sums)-min(cum_sums)
  return(sim_s_diff)
}

set.seed(20010)
bootdiff_pre <- boot(data = pre, statistic = s_diff_fun, R = 10000)
booted_t_pre <- as.data.frame(bootdiff_pre$t)

bootdiff_post <- boot(data = post, statistic = s_diff_fun, R = 10000)
booted_t_post <- as.data.frame(bootdiff_post$t) 

cpd_plot <- function(sample){
  x <- data.frame(date = sample$date, cum_sums = cumsum(sample$value-mean(sample$value)))
  s_diff <- max(x$cum_sums)-min(x$cum_sums)
  change_point_2 <- x$date[which.max(abs(x$cum_sums))]
  set.seed(20010)
  bootdiff <- boot(data = sample, statistic = s_diff_fun, R = 10000)
  booted_t <- as.data.frame(bootdiff$t) 
  booted2 <- ggplot(booted_t)+
    geom_histogram(mapping = aes(V1), bins = 100, color="black", fill="white") + 
    geom_vline(xintercept =  bootdiff$t0, color='red', linetype='dashed')+
    geom_text(aes(x=bootdiff$t0+.05, label="S[diff]", y=350), colour="red", angle=0)+
    labs(x='t-stat', y='frequency', title='Bootstrapped t-stat: 10,000 simulations')+
    theme_minimal()+
    theme(axis.text.x = element_text(angle = 0, size=12), 
          axis.text.y = element_text(size=12), 
          axis.title = element_text(size=14),
          title = element_text(size=14),
          strip.text = element_text(size=14))
  
  changepoint2 <- ggplot(x, aes(x=date, y=cum_sums))+
    geom_point(size=0.5)+
    geom_line()+
    geom_hline(yintercept=0, linetype="dotted", color = "black", size=0.5) + 
    geom_vline(xintercept = change_point_2, linetype='dashed', color='red')+
    geom_text(aes(x=change_point_2+3, label="S[m]", y=0.5), colour="red", angle=0)+
    labs(title='Estimated changepoint', x="", y="Cumulative Sums")+theme_minimal()+
    theme(axis.text.x = element_text(angle = 0, size=12), 
          axis.text.y = element_text(size=12), 
          axis.title = element_text(size=14),
          title = element_text(size=14),
          strip.text = element_text(size=14))
  p2 <- grid.arrange(changepoint2, booted2, nrow=1)
  return(p2)
}


#### Figure B1 ####
pre_plot <- cpd_plot(pre)
ggsave(filename='figure_B1.jpg', plot = pre_plot, width = 12, height=6,
       path=savedfilepath, 
       device = 'jpg')

#### Figure B2 ####
post_plot <- cpd_plot(post)
ggsave(filename='figure_B2.jpg', plot = post_plot, width = 12, height=6,
       path=savedfilepath, 
       device = 'jpg')


#### Section C: Publisher Analysis ####

#### Main Model estimates without The Standard ####

ken_wo_ea <- ken_oped[ken_oped$publisher!="The East African Standard (Nairobi)",]

results_wo_ea <- get_cosines(ken_wo_ea, 
                             word_features,
                             anchor = 'icc', 
                             pre_trained = pre_trained, 
                             transform_matrix = transform_matrix,
                             window_size = 24L)

cpd_check(results_wo_ea)

#### Table C1: Distribution of Articles across Kenyan Publishers ####

pubfreq <- ken_oped %>%
  group_by(publisher) %>%
  summarise(n = n()) %>%
  mutate(Freq = n/sum(n))

pubfreq$"Media Ownership" <- c("Capital Group",
                               "Standard Group",
                               "Nation Media Group",
                               "Radio Africa Group")

pubfreq <- pubfreq %>% rename(Count = n, Proportion = Freq, Publisher = publisher)
pubfreq <- pubfreq[, c("Publisher", "Media Ownership", "Count", "Proportion")] 

xtable(pubfreq)

# Publisher Embeddings #
ken_corpus <- corpus(ken_oped, 
                     docid_field = 'url', text_field = 'body', 
                     meta = c('year', 'month', 'opinion', 'publisher'))

ken_toks <- tokens(ken_corpus, 
                   remove_punct = T, 
                   remove_numbers = T,
                   remove_symbols = T,
                   remove_separators = T,
                   include_docvars = T,
)

mean(ntoken(ken_toks))

ken_toks <- tokens_tolower(ken_toks)

toks_nostop <- tokens_select(ken_toks, pattern = custom_stop_words, selection = "remove", min_nchar = 3)

icc_toks <- tokens_context(toks_nostop, 
                           pattern = c('icc'), 
                           window = 24L, 
                           case_insensitive = T, 
                           verbose = FALSE)

publishers <- get_cos_sim(icc_toks, 
                          groups = docvars(icc_toks, 'publisher'),
                          features=c(
                            'politicize', 'persecute','victimize',
                            'bias', 'discriminate', 'unfair'), 
                          pre_trained=pre_trained,
                          transform=T,
                          transform_matrix=transform_matrix,
                          bootstrap=T,
                          num_bootstraps=100,
                          as_list=FALSE)

levels(publishers$target) <- c("Capital FM", "East African\nStandard", "The Nation", "The Star")

publishers <- publishers %>%
  mutate(pub = fct_relevel(target, 
                           "The Nation", "The Star", "Capital FM", "East African\nStandard",))

#### Figure C3: Publishers ####
pubplot <- ggplot(publishers, aes(x=pub, y=value, group=feature, fill=feature)) + 
  geom_bar(stat = 'identity', position = 'dodge')+
  geom_errorbar(aes(ymin = value - std.error, ymax = value + std.error), alpha=0.5, position = 'dodge', color='black')+
  labs(title='Cosine Similarity Between "icc" and Keyword', 
       fill='Keyword', y='Cosine Similarity Score', x="")+
  theme_minimal()+ scale_fill_viridis_d()+
  theme(axis.text.x = element_text(angle = 0, size=14, vjust = .5), axis.text.y = element_text(size=14),
        axis.title = element_text(size=14),
        title = element_text(size=14),
        legend.text = element_text(size=14))

ggsave(filename = 'figure_C3_publishers.jpg', plot = pubplot,
       width=9, height=6,
       path=savedfilepath, 
       device = 'jpg')


#### Table C2: Type of editorial content ####

opfreq <- ken_oped %>%
  filter(complete.cases(opinion))%>%
  group_by(opinion) %>%
  summarise(n = n()) %>%
  mutate(Freq = n/sum(n))

opfreq$opinion <- c("Non-Opinion", "Opinion")

opfreq <- opfreq %>% rename('Article-type' = opinion, Count = n, Proportion = Freq)

xtable(opfreq)

# opinion embeddings #
opeds <- get_cos_sim(icc_toks, 
                     groups = docvars(icc_toks, 'opinion'),
                     features=c(
                       'politicize', 'persecute','victimize',
                       'bias', 'discriminate', 'unfair'),
                     pre_trained=pre_trained,
                     transform=T,
                     transform_matrix=transform_matrix,
                     bootstrap=T,
                     num_bootstraps=100,
                     as_list=FALSE)

opeds$target <- as.factor(opeds$target)

levels(opeds$target) <- c("Non-Opinion", "Opinion")

opeds <- na.omit(opeds)

#### Figure C3: Opeds ####

opedsplot <- ggplot(opeds, aes(x=as.factor(target), y=value, group=feature, fill=feature)) + 
  geom_bar(stat = 'identity', position = 'dodge', width=0.5)+
  geom_errorbar(aes(ymin = value - std.error, ymax = value + std.error, width=0.5), alpha=0.5, position = 'dodge')+
  labs(title='Cosine Similarity Between "icc" and Keyword', 
       fill='Keyword', y='Cosine Similarity Score', x="")+
  scale_fill_viridis_d()+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 0, size=14, vjust = .5), axis.text.y = element_text(size=14),
        axis.title = element_text(size=14),
        legend.text = element_text(size=14), 
        title = element_text(size=14))

ggsave(filename = 'figure_C3_opeds.jpg', plot= opedsplot,
       width=9, height=6,
       path=savedfilepath, 
       device = 'jpg')


#### Table C3: Distribution of Author-Types (Opinion Pieces)####

authopfreq <- ken_oped %>%
  filter(opinion==1, complete.cases(author_type))%>%
  group_by(author_type) %>%
  summarise(n = n()) %>%
  mutate(Freq = n/sum(n))


authopfreq$author_type <- c('unknown author', 
                            'journalist or editorialboard', 
                            'domestic government official',
                            'international government official', 
                            'international or regional org',
                            'civil society (domestic)',
                            'civil society (international)')

authopfreq <- authopfreq %>% rename('Author-Type' = author_type, Count = n, Proportion = Freq)

xtable(authopfreq)

# Group embeddings by oped author-type
opeds_author <- get_cos_sim(icc_toks, 
                            groups = docvars(icc_toks, 'author_type'),
                            features=c(
                              'politicize', 'persecute','victimize',
                              'bias', 'discriminate', 'unfair'),
                            pre_trained=pre_trained,
                            transform=T,
                            transform_matrix=transform_matrix,
                            bootstrap=T,
                            num_bootstraps=100,
                            as_list=FALSE)

opeds_author <- na.omit(opeds_author)

opeds_author$target <- as.factor(opeds_author$target)

levels(opeds_author$target) <- c('unknown\nauthor', 
                                 'journalist\nor editorial\nboard', 
                                 'domestic\ngovernment\nofficial',
                                 'international\ngovernment\nofficial', 
                                 'international\nor regional org',
                                 'civil society\n(domestic)',
                                 'civil society\n(international)')

opeds_author <- opeds_author %>%
  mutate(author = fct_relevel(target, 
                              'journalist\nor editorial\nboard', 
                              'domestic\ngovernment\nofficial',
                              'international\ngovernment\nofficial', 
                              'international\nor regional org',
                              'civil society\n(domestic)',
                              'civil society\n(international)', 'unknown\nauthor'))

#### Figure C4: Author-Types ####

authorsplot <- ggplot(opeds_author, aes(x=author, y=value, group=feature, fill=feature)) + 
  geom_bar(stat = 'identity', position = 'dodge', width=0.5)+
  geom_errorbar(aes(ymin = value - std.error, ymax = value + std.error, width=0.5), alpha=0.5, position = 'dodge')+
  labs(title='Cosine Similarity Between "icc" and Keyword', 
       fill='Keyword', y='Cosine Similarity Score', x="")+
  scale_fill_viridis_d()+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 0, size=10, vjust = .5),
        axis.title = element_text(size=12),
        legend.text = element_text(size=12), 
        title = element_text(size=14))


ggsave(filename = 'figure_C4.jpg', plot=authorsplot,
       width=10, height=6,
       path=savedfilepath, 
       device = 'jpg')
